message("Loading utilities")

library(tidyverse)
library(powerjoin)
## estimation
library(estimatr)
library(equivalence)
library(lmtest)
library(sandwich)
library(tidymodels)
library(broom)
library(zaminfluence)
# visualisation
library(ggplot2)
library(ggrepel)
library(ggimage)
library(latex2exp)
# text-as-data
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
# other
library(foreach)
library(pryr)
library(data.table)
library(pbapply)
library(magrittr)

options(readr.show_col_types = FALSE)

select <- dplyr::select

#####------------------------------------------------------#
##### GLOBALS ####
#####------------------------------------------------------#

GLOBAL_ALPHA_THRESHOLD = 0.05

LOG_OFFSET = 0.1

SUPPORT_RGX = "(support|stand with|stand for|stand behind|salute|in solidarity) "
OPPOSE_RGX = "(oppose|condemn|stand against|reject) "

N_CONGRESSIONAL_PHRASES = 500

## for scaling, the minimum unique number of brands/phrases
## per phrases/brands to include said brand/phrase
MIN_UNIQUE_BRANDS_PER_PHRASE = 8
MIN_UNIQUE_PHRASES_PER_BRAND = 8

## for scaling, the minimum total count of brands/phrases
## per phrases/brands to include said brand/phrase
MIN_TOTAL_BRAND_COUNT_PER_PHRASE = 8
MIN_TOTAL_PHRASE_COUNT_PER_BRAND = 8

PROXIMITY_DAYS = 3

STUDY_END_YMD = "2022-12-01"
STUDY_START_YMD = "2014-12-01"
STUDY_YEARS = tryCatch({
  as.numeric(substr(STUDY_START_YMD, 1, 4)):as.numeric(substr(STUDY_END_YMD, 1, 4))
}, error = function(e) {
  2014:2021
})

EMOJI_FLAG_FPATH = "graphics/emoji_flag.png"
EMOJI_FLAG_THIN_FPATH = "graphics/emoji_flag_thin.png"

## text dictionary categories to use later
text_cols =  c("text.believe","text.market","text.celebrate","text.charity",
               "text.donation","text.goals_mission","text.invest","text.honor",
               "text.info","text.pride","text.oppose","text.support",
               "text.mtwabp","text.voluntarism","text.value","text.philanthropy")

trans_f <- function(x) asinh(x/2)/log(10)
inv_f <- function(y) 2 * sinh(y * log(10))

pseudolog_trans <- scales::trans_new(name = 'pseudo log',
                                     transform = trans_f, 
                                     inverse   = inv_f,
                                     domain    = c(-Inf,Inf))

#####------------------------------------------------------#
##### FUNCTIONS ####
#####------------------------------------------------------#

## ++ Plotting ---------------------------------------------

OPEN_FILE_DEFAULT = TRUE

ggsave_v <- function(p = last_plot(), filename, open.file=OPEN_FILE_DEFAULT, ...) {
  dirpath <- dirname(filename)
  if (!dir.exists(dirpath)) {
    message(sprintf("making folder `%s`",dirpath))
    dir.create(dirpath)
  }
  suppressWarnings(ggsave(p, filename = filename, ...))
  message(sprintf("saved to `%s`",filename))
  
  if (open.file) {
  	system(sprintf("open '%s'",filename))
  }
}

histogram_plot <- function(df, 
                           x.col, group.col, 
                           group.lab = NA, x.lab = "", y.lab = "Density",
                           stats.lbl.loc = "top left") {
  df$x <- df[[x.col]]
  df$g <- df[[group.col]]
  if (is.na(group.lab)) {
    group.lab <- paste0(group.col,":")
  }
  
  x.means <- df %>%
    group_by(g) %>%
    summarise(x = mean(x, na.rm=T), .groups = "drop")
  
  if (length(unique(df$g)) == 2) {
    fit <- lm_robust(x ~ g, data = df)
    t.stat <- tidy(fit)[[2,4]]
    p.val <- tidy(fit)[[2,5]]
    stats.lbl <- sprintf("t = %0.2f, p = %0.2f", t.stat, p.val)
  } else {
    fit <- NULL
    t.stat <- NULL
    p.val <- NULL
    stats.lbl <- ""
  }
  
  annot <- data.frame(
    #left-bottom, left-top, right-bottom, right-top
    xpos = c(-Inf, -Inf, Inf, Inf), 
    ypos =  c(-Inf, Inf, -Inf, Inf), 
    hjustvar = c(-.15,   #shifts bottom left 'Text' to the right; make more negative to move it further right
                 -.15,   #shifts top left 'tExt' to the right; make more negative to move it further right
                 1.15,   #shifts bottom right 'teXt' to the left; make more positive to move it further left
                 1.15),  #shifts top right 'texT' to the left; make more positive to move it further left
    vjustvar = c(-1,    #shifts bottom left 'Text' upward; make more negative to move it further up
                 2,     #shifts top left 'tExt' downward; make more positive to move it further down
                 -1,    #shifts bottom right 'teXt' upward; make more negative to move it further up
                 2)     #shifts top right 'texT' downward; make more positive to move it further down
  )
  
  if (length(unique(df$g)) > 1 & !is.na(p.val)) {
    p <- ggplot(df, aes(x=x, fill=g))
  } else {
    p <- ggplot(df, aes(x=x), fill="gray")
  }
  p <- p + 
    geom_density(alpha = 0.2, aes(y=..scaled..))
  
  if (length(unique(df$g)) > 1 & !is.na(p.val)) {
    p <- p + geom_vline(data = x.means, aes(xintercept = x, color = g))
  } else {
    p <- p + geom_vline(xintercept = mean(df$x, na.rm=T))
  }
  p <- p + 
    labs(fill = group.lab, color = group.lab) +
    xlab(x.lab) + 
    ylab(y.lab) +
    theme(legend.position = "top")
  
  if (stats.lbl.loc == "bottom left") {
    p <- p + annotate("text", x=annot$xpos[1], y=annot$ypos[1], hjust=annot$hjustvar[1], vjust=annot$vjustvar[1], label=stats.lbl, fontface="bold")
  }
  
  if (stats.lbl.loc == "top left") {
    p <- p + annotate("text", x=annot$xpos[2], y=annot$ypos[2], hjust=annot$hjustvar[2], vjust=annot$vjustvar[2], label=stats.lbl, fontface="bold")
  }
  
  if (stats.lbl.loc == "bottom right") {
    p <- p + annotate("text", x=annot$xpos[3], y=annot$ypos[3], hjust=annot$hjustvar[3], vjust=annot$vjustvar[3], label=stats.lbl, fontface="bold")
  }
  
  if (stats.lbl.loc == "top right") {
    p <- p + annotate("text", x=annot$xpos[4], y=annot$ypos[4], hjust=annot$hjustvar[4], vjust=annot$vjustvar[4], label=stats.lbl, fontface="bold")
  }
  
  return(list(plot=p,
              t.test=list(
                lm.fit = fit,
                t.stat = t.stat,
                p.val = p.val
              )))
}

alignment_gridplot <- function(df, grid.col, grid.ncols = 2, ...) {
  plist <- list()
  outlist <- list()
  grid_vals <- unique(sort(df[[grid.col]]))
  for (g in grid_vals) {
    message(g)
    df_g   <- df[df[[grid.col]] == g,]
    df_g$g <- g
    o_g    <- measure_alignment(df_g, ...)
    p_g    <- o_g$plot
    plist[[g]]   <- p_g + facet_wrap( ~ g)
    outlist[[g]] <- o_g
  }
  # p <- gridExtra::grid.arrange(grobs = plist, ncol = grid.ncols)
  p <- cowplot::plot_grid(plotlist = plist, ncol = grid.ncols)
  # p
  return(list(plot=p,
              plot.list=plist,
              out.list=outlist))
}

measure_alignment <- function(df,
                              align.condition = NA,
                              x.col, y.col, 
                              x.lab = NA, y.lab = NA, 
                              size.col = NA, lbl.col = NA, 
                              x.mid = 0, y.mid = 0.5, 
                              x.lim = NA, y.lim = NA, 
                              max.overlaps = 5,
                              text.size = 3, 
							  r.caption.size = 9,
                              point.alpha = 0.5,
                              hist.missing = FALSE, ### show histogram of missing points
                              annot.points = TRUE,
                              annot.quads = TRUE, 
                              annot.quads.colored = TRUE,
                              ...) {
  df$x <- as.numeric(df[[x.col]])
  df$y <- as.numeric(df[[y.col]])
  
  missing.x <- sum(is.na(df$x))
  missing.y <- sum(is.na(df$y))
  
  ## ------------ Make histograms to examine missingness (optional)
  p.hist.miss.x <- NULL
  p.hist.miss.y <- NULL
  
  if (hist.missing & !(missing.x==0)) {
    message(sprintf("Making histogram of `%s` for `%s`-missing vs. not `%s`-missing", y.col, x.col, x.col))
    p.hist.miss.x <- df %>%
      mutate(x.missing = is.na(x)) %>%
      histogram_plot(x.col = "y", group.col="x.missing", group.lab = paste(x.lab, "Missing?\n"), x.lab = y.lab)
  }
  if (hist.missing & !(missing.y==0)) {
    message(sprintf("Making histogram of `%s` for `%s`-missing vs. not `%s`-missing", x.col, y.col, y.col))
    p.hist.miss.y <- df %>%
      mutate(y.missing = is.na(y)) %>%
      histogram_plot(x.col = "x", group.col="y.missing", group.lab = paste(y.lab, "Missing?\n"), x.lab = x.lab)
  }
  
  ## ------------ Subset data
  df <- filter(df, !is.na(x), !is.na(y))
  # df <- subset(df, align.condition)
  suppressWarnings({
    if (!any(is.na(as.character(enquo(align.condition))))) { ## http://adv-r.had.co.nz/Computing-on-the-language.html
      condition_call <- substitute(align.condition)
      n0 <- nrow(df)
      r <- eval(condition_call, df)
      if (!any(is.na(r))) {
        df <- df[r, ]
        n1 <- nrow(df)
        message(sprintf("Subsetting drops data from %i to %i", n0, n1))
      }
    }
  })
  
  if (is.na(size.col)) {
    df$size <- 1
  } else {
    df$size <- df[[size.col]]
  }
  if (is.na(lbl.col)) {
    df$lbl <- NA
  } else {
    df$lbl <- df[[lbl.col]]
  }  
  
  ## ------------ Compute statistics
  
  #### t-test
  ols.robust.fit <- lm_robust(y ~ x, data = df)
  ols.robust.fit.summary <- summary(ols.robust.fit)
  ols.robust.pval <- case_when(ols.robust.fit.summary$coefficients[2,4] > 0.05 ~ "",
                               ols.robust.fit.summary$coefficients[2,4] > 0.01 ~ "*",
                               ols.robust.fit.summary$coefficients[2,4] > 0.001 ~ "**",
                               TRUE ~ "***")
  #### correlations
  rho          <- cor(df$x, df$y, use="pairwise.complete.obs")
  rho.kendall  <- cor(df$x, df$y, use="pairwise.complete.obs", method="kendall")
  rho.spearman <- cor(df$x, df$y, use="pairwise.complete.obs", method="spearman")
  
  #### quadrant count ratio (QCR)
  q4 <- sum(df$x > x.mid & df$y < y.mid,na.rm=T)
  q3 <- sum(df$x < x.mid & df$y < y.mid,na.rm=T)
  q2 <- sum(df$x < x.mid & df$y > y.mid,na.rm=T)
  q1 <- sum(df$x > x.mid & df$y > y.mid,na.rm=T)
  qcr <- ((q1+q3) - (q2+q4))/(q1+q2+q3+q4)
  
  stats <- list()
  stats$qcr <- qcr
  stats$rho <- rho
  stats$rho.kendall <- rho.kendall
  stats$rho.spearman <- rho.spearman
  stats$ols.robust.pval <- ols.robust.pval
  
  if (annot.quads) {
    annot <- data.frame(
      #left-bottom, left-top, right-bottom, right-top
      color = c("blue","grey20","grey20","red"),
      xpos = c(-Inf, -Inf, Inf, Inf), 
      ypos =  c(-Inf, Inf, -Inf, Inf),
      annotateText = c(
        paste0(round(mean(df$x < x.mid & df$y < y.mid,na.rm=T),2)*100,"%"), 
        paste0(round(mean(df$x < x.mid & df$y > y.mid,na.rm=T),2)*100,"%"),
        paste0(round(mean(df$x > x.mid & df$y < y.mid,na.rm=T),2)*100,"%"),
        paste0(round(mean(df$x > x.mid & df$y > y.mid,na.rm=T),2)*100,"%")    
      ),
      hjustvar = c(-.25,   #shifts bottom left 'Text' to the right; make more negative to move it further right
                   -.25,   #shifts top left 'tExt' to the right; make more negative to move it further right
                   1.25,   #shifts bottom right 'teXt' to the left; make more positive to move it further left
                   1.25),  #shifts top right 'texT' to the left; make more positive to move it further left
      vjustvar = c(-.75,    #shifts bottom left 'Text' upward; make more negative to move it further up
                   1.75,     #shifts top left 'tExt' downward; make more positive to move it further down
                   -.75,    #shifts bottom right 'teXt' upward; make more negative to move it further up
                   1.75)     #shifts top right 'texT' downward; make more positive to move it further down
    )
  }
  
  ## ------------ Make correlation plot
  p <- ggplot(df, aes(x=x, y=y))
  
  if (!suppressWarnings(is.na(as.character(x.lab)))) { 
    p <- p + xlab(x.lab) 
  }
  if (!suppressWarnings(is.na(as.character(y.lab)))) { 
    p <- p + ylab(y.lab) 
  }  
  if (any(!is.na(x.lim))) { 
    p <- p + xlim(x.lim) 
  }
  if (!any(is.na(y.lim))) { 
    p <- p + ylim(y.lim) 
  }
  if (!any(is.na(size.col))) {
    p <- p + geom_point(aes(size=size), alpha=point.alpha, shape=1) 
  } else {
    p <- p + geom_point(alpha=point.alpha, shape=1)    
  }
  
  p <- p +
    stat_smooth(method = "loess", colour="orange", se=F) + 
    stat_smooth(method = "lm", colour="purple", se=F)
  
  if (annot.points) {
    p <- p + geom_text_repel(aes(label=lbl), max.overlaps=max.overlaps, size=text.size, ...)  
  }
  
  if (annot.quads & !annot.quads.colored) {
    p <- p + geom_label(data = annot, aes(x = xpos, y = ypos, hjust = hjustvar, vjust = vjustvar, label = annotateText), fontface="bold")
  } else  if (annot.quads & annot.quads.colored) {
    p <- p + geom_label(data = annot, aes(x = xpos, y = ypos, hjust = hjustvar, vjust = vjustvar, label = annotateText, fill = color), colour="white", fontface="bold", label.size=1) + scale_fill_identity()
  }
  
  if (!is.na(x.mid)) { 
    p <- p + geom_vline(xintercept = x.mid, lty=2, alpha=1)
  }
  if (!is.na(y.mid)) { 
    p <- p + geom_hline(yintercept = y.mid, lty=2, alpha=1)
  }
  
  p <- p +
    theme_bw() +
    labs(caption = TeX(sprintf("\\textbf{$\\r$ = %0.3f%s}", rho, ols.robust.pval))) +
    theme(legend.position = "none",
          plot.caption = element_text(size=r.caption.size, face="bold"),
          axis.title = element_text(size=10))
  
  out <- list()
  out$plot           <- p
  out$ols.robust.fit <- ols.robust.fit
  out$stats          <- stats
  
  if (hist.missing) {
    out$p.hist.miss.x <- p.hist.miss.x
    out$p.hist.miss.y <- p.hist.miss.y
    plot.hist.miss <- cowplot::plot_grid(out$p.hist.miss.x$plot +
                                           theme(legend.position="top", 
                                                 legend.box="horizontal",
                                                 legend.text.align = 0.5,
                                                 legend.title.align = 0.5,
                                                 legend.title=element_text(size=8)), 
                                         out$p.hist.miss.y$plot +
                                           theme(legend.position="top", 
                                                 legend.box="horizontal",
                                                 legend.text.align = 0.5,
                                                 legend.title.align = 0.5,
                                                 legend.title=element_text(size=8)), nrow=2)
    out$plot.miss   <- plot.hist.miss
  }
  
  return(out)
}

partisan_phrase_scatterplot <- function(df, 
                                        lbl.col, 
                                        x.col, 
                                        x.lab = NA,
                                        x.lim = NA,
                                        y.col,
                                        y.lab = NA,
                                        y.lim = NA,
                                        caption = NA,
                                        color.col = NA, 
                                        size.col = NA, 
                                        max.overlaps = 10, 
                                        text.size = 4,
                                        point.alpha = 0.5,
                                        flag.size = .02,
                                        flag.thin = TRUE,
                                        symmetric = TRUE,
                                        annot.top.left = NA,
                                        annot.bottom.left = NA,
                                        annot.top.right = NA,
                                        annot.bottom.right = NA,
                                        ...) {
  df$l <- df[[lbl.col]]
  df$x <- df[[x.col]]
  df$y <- df[[y.col]]
  if (!is.na(color.col)) {
    df$c <- df[[color.col]]
    c.neg <- names(sort(-table(d.text$color[d.text$chi2 < 0])))
    c.pos <- names(sort(-table(d.text$color[d.text$chi2 > 0])))
  } else {
    df$c <- "black"
    c.neg <- "black"
    c.pos <- "black"
  }
  if (!is.na(size.col)) {
    df$s <- df[[size.col]]
  } else {
    df$s <- NA
  }
  df <- filter(df, !is.na(x), !is.na(y))
  x.mean <- weighted.mean(x = d.text$chi2, w = d.text$n_brands)
  x.mean.c <- ifelse(x.mean < 0, c.neg, ifelse(x.mean == 0, "black", c.pos))
  
  p <- df %>%
    mutate(l = ifelse(grepl("🇺🇸", l), "", l)) %>%
    ggplot(aes(x=x, y=y))
  
  p <- p + geom_vline(xintercept = x.mean, lty = 2, color = x.mean.c, alpha = 0.8)
  
  p <- p + geom_text_repel(aes(label=l, colour=c), size=text.size, max.overlaps=max.overlaps, ...)
  
  if (!is.na(size.col)) {
    p <- p + geom_point(aes(colour=c, size=s), shape = 1, alpha=point.alpha)
  } else {
    p <- p + geom_point(shape = 1, alpha=point.alpha)
  }
  if (!(is.na(as.character(x.lab)))) { 
    p <- p + xlab(x.lab) 
  }
  if (!(is.na(as.character(y.lab)))) { 
    p <- p + ylab(y.lab) 
  }
  if (any(!is.na(x.lim))) { 
    p <- p + xlim(x.lim) 
  }
  if (symmetric) {
    x.range <- c(-max(abs(range(d.text$chi2))), 
                 max(abs(range(d.text$chi2))))
    p <- p + xlim(x.range)
  }
  
  if (!any(is.na(y.lim))) { 
    p <- p + ylim(y.lim) 
  }
  if (!is.na(as.character(caption))) {
    p <- p + labs(caption = caption) 
  }
  
  p <- p +
    scale_color_identity() +
    theme(legend.position = "none",
          plot.caption = element_text(hjust = 0.5))
  
  if (any(grepl("🇺🇸", df$l))) {
    p <- p +
      geom_image(data = df %>%
                   # filter(grepl("🇺🇸", l)) %>%
                   filter(grepl("🇺🇸", l)) %>%
                   arrange(-y) %>%
                   head(1) %>% ### only plot flag phrase with highest y value
                   mutate(img = ifelse(flag.thin, EMOJI_FLAG_FPATH, EMOJI_FLAG_THIN_FPATH)),
                 aes(image = img), size = flag.size) # +
    # geom_text(data = df %>%
    #             filter(grepl("🇺🇸", l)) %>%
    #             mutate(l = gsub("🇺🇸", "", l)),
    #           aes(label=l, colour=c), nudge_x=-2*flag.size, size=text.size, max.overlaps=max.overlaps, hjust=1)
  }
  
  annot <- data.frame(
    #left-bottom, left-top, right-bottom, right-top
    xpos = c(-Inf, -Inf, Inf, Inf), 
    ypos =  c(-Inf, Inf, -Inf, Inf), 
    hjustvar = c(-.25,   #shifts bottom left text to the right; make more negative to move it further right
                 -.25,   #shifts top left text to the right; make more negative to move it further right
                 1.25,   #shifts bottom right text to the left; make more positive to move it further left
                 1.25),  #shifts top right text to the left; make more positive to move it further left
    vjustvar = c(-1,    #shifts bottom left text upward; make more negative to move it further up
                 2,     #shifts top left text downward; make more positive to move it further down
                 -1,    #shifts bottom right text upward; make more negative to move it further up
                 2)     #shifts top right text downward; make more positive to move it further down
  )
  
  if (!is.na(as.character(annot.bottom.left))) {
    p <- p + annotate("text", x=annot$xpos[1], y=annot$ypos[1], hjust=annot$hjustvar[1], vjust=annot$vjustvar[1], label=annot.bottom.left, fontface="bold")
  }
  
  if (!is.na(as.character(annot.top.left))) {
    p <- p + annotate("text", x=annot$xpos[2], y=annot$ypos[2], hjust=annot$hjustvar[2], vjust=annot$vjustvar[2], label=annot.top.left, fontface="bold")
  }
  
  if (!is.na(as.character(annot.bottom.right))) {
    p <- p + annotate("text", x=annot$xpos[3], y=annot$ypos[3], hjust=annot$hjustvar[3], vjust=annot$vjustvar[3], label=annot.bottom.right, fontface="bold")
  }
  
  if (!is.na(as.character(annot.top.right))) {
    p <- p + annotate("text", x=annot$xpos[4], y=annot$ypos[4], hjust=annot$hjustvar[4], vjust=annot$vjustvar[4], label=annot.top.right, fontface="bold")
  }
  
  return(p)
}

## ++ Text -------------------------------------------------

categorize_var <- function(x, multi_line=T) {
  y <- x
  y[grepl("(hrc|gd|glassdoor)",y)] <- "Workplace\nEnvironment"
  y[grepl("(rev|empl)",y)] <- "Firm"
  y[grepl("(R_don_|FEC|zippia)",y)] <- "Employees"
  y[grepl("cong_",y)] <- "HQ Representatives"
  y[grepl("pres\\.REP",y)] <- "Consumers,\nEmployees,\nProximal Voters"
  y[grepl("(twit|^sl)",y)] <- "Consumers"
  y[grepl("(twit|^sl)",y)] <- "Consumers"
  y[grepl("yougov",y)] <- "Consumers"
  y[grepl("stkhl.R",y)] <- "All"
  y[grepl("(legis|opsec)",y)] <- "Political\nActivities"
  y[grepl("(hrc|gd)",y)] <- "Workplace\nEnvironment"
  y[grepl("(gjf)",y)] <- "Regulatory\nCompliance"
  y[grepl("(cdp|clm100)",y)] <- "Climate\nPolicy"

  if (!multi_line) {
    y <- gsub("\n", " ", y)
    y <- gsub("  ", " ", y)
  }
  
  return(y)  
}

sanitize_var <- function(x, add_category=T, multi_line=F) {
  y <- x
  var_to_lbl <- list(
    "hq_main_in_US" = "Primary HQ in U.S.",
    "R_don_share" = "% R. Donations:\nAll",
    "R_don_share.Board_Member" = "% R. Donations:\nBoard Members",
    "R_don_share.Managers" = "% R. Donations:\nManagers",
    "R_don_share.Legal" = "% R. Donations:\nLegal",
    "R_don_share.Human_Resources" = "% R. Donations:\nHR",
    "R_don_share.Top_Exec" = "% R Donations:\nExecutives",
    "R_don_share.Public_Relations" = "% R. Donations:\nPR",
    "R_don_share.Marketing" = "% R Donations:\nMarketing",
    "R_don_share.Rank_and_File" = "% R. Donations:\nRank-and-file",
    "twitter.foll_ideo_slant" = "Twitter: R. Direction of\nFollowers",
    "sl.Rep_Pct" = "Twitter: % R. Followers",
    "sl.Rep_Pct.2017_02" = "Twitter: % R.\nFollowers (2017)",
    "sl.Rep_Pct.2022_10" = "Twitter: % R.\nFollowers (2022)",
    "hq_pres.REP" = "% R. Pres. Vote:\nHQ",
    "zi.pres.REP" = "% R. Pres. Vote:\nLocations (Zippia)",
    "sg.pres.REP" = "% R. Pres. Vote:\nLocations (SafeGraph)",
    "cong_house_dw_mean" = "R. Direction of\nHQ House Rep.",
    "cong_sen_dw_mean" = "R. Direction of\nHQ Senator",
    "stkhl.R" = "% R. Stakeholders",
    "legis.R_frac" = "% R. Legislators Lobbied",
    "yougov_aud.gender.Female" = "Consumers: % Female Consumers",
    "yougov_aud.gender.Male" = "Consumers: % Male Consumers",
    # "yougov_aud.age.18-24" = "Consumers: % Consumers 18-24 y.o.",
    # "yougov_aud.age.25-34" = "Consumers: % Consumers 25-34 y.o.",
    # "yougov_aud.age.35-44" = "Consumers: % Consumers 35-54 y.o.",
    # "yougov_aud.age.45-54" = "Consumers: % Consumers 45-54 y.o.",
    # "yougov_aud.age.55+" = "Consumers: % Consumers 55+ y.o.",
    # "yougov_aud.educ.2-year" = "Consumers: % Consumers 2-year degrees",
    # "yougov_aud.educ.4-year" = "Consumers: % Consumers 4-year degrees",
    # "yougov_aud.educ.High_school_graduate" = "Consumers: % Consumers high "
    # "yougov_aud.educ.No_HS"
    # "yougov_aud.educ.Post-grad"
    # "yougov_aud.educ.Some_college"
    # "yougov_aud.empl.Other"
    # "yougov_aud.empl.Permanently_disabled"
    # "yougov_aud.empl.Prefer_not_to_say"
    # "yougov_aud.empl.Retired"
    # "yougov_aud.empl.Student"
    # "yougov_aud.empl.Taking_care_of_home_or_family"
    # "yougov_aud.empl.Temporarily_unemployed_(i.e._between_jobs)"
    # "yougov_aud.empl.Unemployed"
    # "yougov_aud.empl.Working_full_time"
    # "yougov_aud.empl.Working_part_time"
    # "yougov_aud.Income.<$40k"
    # "yougov_aud.Income.>$200k"
    # "yougov_aud.Income.$40-80k"
    # "yougov_aud.Income.$80-200k"
    # "yougov_aud.region.Midwest"
    # "yougov_aud.region.Northeast"
    # "yougov_aud.region.South"
    # "yougov_aud.region.West"
    "yougov_pct_neg_opinion" = "% Negative Opinion",
    "yougov_pct_net_opinion" = "Net Opinion",
    "yougov_pct_pos_opinion" = "% Positive Opinion",
    "yougov_pct_recognition" = "% Recognition",
    "num_empl.final.log" = "Firm: log(Number of Employees)",
    "rev_mil.final.log" = "Firm: log(Revenue $1M)",
    # "zippia_empl.Ages.18-20"
    # "zippia_empl.Ages.20-30"
    # "zippia_empl.Ages.30-40"
    # "zippia_empl.Ages.40+"
    # "zippia_empl.Ages.Less than 18"
    # "zippia_empl.Degrees.Associate"
    # "zippia_empl.Degrees.Bachelors"
    # "zippia_empl.Degrees.Certificate"
    # "zippia_empl.Degrees.Diploma"
    # "zippia_empl.Degrees.Doctorate"
    # "zippia_empl.Degrees.High School Diploma"
    # "zippia_empl.Degrees.License"
    # "zippia_empl.Degrees.Masters"
    # "zippia_empl.Ethnicity.Asian"
    "zippia_empl.Ethnicity.White"= "Employees:\n% White Employees",
    "zippia_empl.Genders.Female" = "Employees:\n% Female Employees",
    "zippia_empl.Genders.Male" = "Employees\n % Male Employees",
    "zippia_empl.Ethnicity.White"= "Employees:\n% White Employees",
    "zippia_empl.Ethnicity.Non-White"= "Employees:\n% Non-White Employees",
    "zippia_empl.Ethnicity.Black Or African American"= "Employees:\n% Black Employees",
    "hrc_rating" = "Workplace Rating:\nLGBTQ+ Equality Score\n(HRC)",
    "glassdoor_Rating.Diversity and Inclusion" = "Workplace Rating:\nDiversity and Inclusion\n(Glassdoor)",
    "glassdoor_review_avg" = "Workplace Rating:\nAverage Rating\n(Glassdoor)",
    "gd.Gender.Women.avg_rating" = "Workplace Rating:\nWomen Employee Satisfaction\n(Glassdoor)",
    "gd.Gender.Trasngender and/or Non-Binary.avg_rating" = "Workplace Rating:\nTransgender/Non-Binary Employee Satisfaction\n(Glassdoor)",
    "gd.Sexual Orientation.LGBTQ+.avg_rating" = "Workplace Rating:\nLGBTQ+ Employee Satisfaction\n(Glassdoor)",
    "gd.Race / Ethnicity.Black or African American.avg_rating" = "Workplace Rating:\nBlack Employee Satisfaction\n(Glassdoor)",
    
    "cdp_avg_score" = "Climate: Disclosure and\nAction Score (CDP)",
    "gjf.n_off.discr" = "Compliance: Number of\nDiscrimination Offenses",
    "gjf.n_off.labor" = "Compliance: Number of\nLabor Violations",
    "gjf.n_off.environ" = "Compliance: Number of\nEnvironmental Violations",
    "opsec.R_cand_share.org_dollars" = "% PAC $ on R. Candidates",
    "opsec.R_share.org_dollars" = "% PAC $ on R. Groups",
    "clm100_policy.Organisation Score.March 2022" = "Climate: Organization Score\n(Climate Action 100+)",
    "tw_count" = "Number of Tweets",
    "tw_followers" = "Number of Twitter Followers"
  )
  for (i in names(var_to_lbl)) {
    y[y == i] <- var_to_lbl[[i]]
  }
  
  clm100_rgx <- list(
    "clm100.*5.1.*\\.a" = "Climate: Discloses Decarbonisation Strategy",
    "clm100.*5.1.*\\.b" = "Climate: Quantifies Decarbonisation Outcomes",
    "clm100.*5.2.*\\.a" = "Climate: Generates 'Green Revenue'",
    "clm100.*5.2.*\\.b" = "Climate: Targets for 'Green Revenue'",    
    "clm100.*6.1.*\\.a" = "Climate: Decarbonising CapEx",
    "clm100.*6.1.*\\.b" = "Climate: Decarbonising CapEx\nAligned with Paris Accord",
    "clm100.*6.2.*\\.a" = "Climate: Discloses Paris Accord\nAlignment Methodology",
    "clm100.*6.2.*\\.b" = "Climate: Quantifies Carbon\nEmissions Outcomes",
    "clm100.*7.1.*\\.a" = "Climate: Lobbying Aligned\nwith Paris Accord",
    "clm100.*7.1.*\\.b" = "Climate: Discloses Climate Lobbying",
    "clm100.*7.2.*\\.a" = "Climate: Trade Assoc. Lobbying\nAligned with Paris Accord",
    "clm100.*7.2.*\\.b" = "Climate: Discloses Trade Assocs",
    "clm100.*7.3.*\\.a" = "Climate: Reviews Trade Assoc.\nClimate Positions",
    "clm100.*7.3.*\\.b" = "Climate: Responds to Trade Assoc.\nClimate Positions",
    "clm100.*8.1.*\\.a" = "Climate: Board Discloses Climate\nRisk Management",
    "clm100.*8.1.*\\.b" = "Climate: Board Member Responsible\nfor Climate Change",
    "clm100.*8.2" = "Climate: Exec. Pay Reflects\nClimate Change Goals",
    "clm100.*8.3.*Mar.*\\a" = "Climate: Board Members Evaluated\nfor Climate Risk Management",
    "clm100.*8.3.*Mar.*\\b" = "Climate: Board Members Transparently\nEvaluated for Climate Risk Management"
  )
  for (i in names(clm100_rgx)) {
    y[grepl(i, y)] <- clm100_rgx[[i]]
  }
  
  if (!add_category) {
    y <- gsub(".*\\:", "", y)
    y <- gsub("  ", " ", y)
  }
    
  if (!multi_line) {
    y <- gsub("\n", " ", y)
    y <- gsub("  ", " ", y)
  }
  
  return(y)
}

destem_text <- function(x) {
  x <- gsub("_"," ",x)
  x <- gsub("abov","above",x,ignore.case=T)
  x <- gsub("vaccin","vaccine",x,ignore.case=T)
  x <- gsub("mandat","mandate",x,ignore.case=T)
  x <- gsub("domest","domestic",x,ignore.case=T)
  x <- gsub("communiti","community",x,ignore.case=T)
  x <- gsub("black matern","black maternal",x,ignore.case=T)
  x <- gsub("war.*drug","war on drugs",x,ignore.case=T)
  x <- gsub("desanti","desantis",x,ignore.case=T)
  x <- gsub("illeg","illegal",x,ignore.case=T)  
  x <- gsub("coverag","coverage",x,ignore.case=T)  
  x <- gsub("reproduct","reproductive",x,ignore.case=T) 
  x <- gsub("enemi","enemy",x,ignore.case=T) 
  x <- gsub("gun viol.*","gun violence",x,ignore.case=T) 
  x <- gsub("safeti","safety",x,ignore.case=T) 
  x <- gsub("crisi","crisis",x,ignore.case=T) 
  x <- gsub("immigr","immigrant",x,ignore.case=T) 
  x <- gsub("abort","abortion",x,ignore.case=T) 
  x <- gsub("issu","issue",x,ignore.case=T)
  x <- gsub("suppli chain","supply chain",x,ignore.case=T)
  x <- gsub("infrastructur","infrastructure",x,ignore.case=T)
  x <- gsub("ensur","ensure",x,ignore.case=T)
  x <- gsub("enforc offic","enforcement officer",x,ignore.case=T)
  x <- gsub("law enforc","law enforcement",x,ignore.case=T)
  x <- gsub("parti","party",x,ignore.case=T)
  x <- gsub("justic","justice",x,ignore.case=T)
  x <- gsub("armed.*forc","armed forces",x,ignore.case=T)
  x <- gsub("women color","women of color",x,ignore.case=T)
  x <- gsub("climat","climate",x,ignore.case=T)
  x <- gsub("chang","change",x,ignore.case=T)
  x <- gsub("famili","families",x,ignore.case=T)
  x <- gsub("histori","history",x,ignore.case=T)
  x <- gsub("lewi","lewis",x,ignore.case=T)
  x <- gsub("merri","merry",x,ignore.case=T)
  x <- gsub("stand solidar","stand in solidarity",x,ignore.case=T)
  x <- gsub("natur gas","natural gas",x,ignore.case=T)
  x <- gsub("nation secur","national security",x,ignore.case=T)
  x <- gsub("defund polic","defund the police",x,ignore.case=T)
  x <- gsub("found father","founding father",x,ignore.case=T)
  x <- gsub("found father","founding father",x,ignore.case=T)
  x <- gsub("everyon deserv","everyone deserves",x,ignore.case=T)
  x <- gsub("air forc","air force",x,ignore.case=T)
  x <- gsub("christma","christmas",x,ignore.case=T)
  x <- gsub("mani","many",x,ignore.case=T)
  x <- gsub("vot.*right","voting rights",x,ignore.case=T)
  x <- gsub("militari","military",x,ignore.case=T)
  x <- gsub("servic","service",x,ignore.case=T)
  x <- gsub("prescript","prescription",x,ignore.case=T)
  x <- gsub("voic","voice",x,ignore.case=T)
  x <- gsub("georg","george",x,ignore.case=T)
  x <- gsub("busi","business",x,ignore.case=T)
  x <- gsub("peopl","people",x,ignore.case=T)
  x <- gsub("suprem","supreme",x,ignore.case=T)
  x <- gsub("rescu","rescue",x,ignore.case=T)
  x <- gsub("afford","affordable",x,ignore.case=T)
  x <- gsub("administr","administration",x,ignore.case=T)
  x <- gsub("pandem","pandemic",x,ignore.case=T)
  x <- gsub("qualiti","quality",x,ignore.case=T)
  x <- gsub("heritag","heritage",x,ignore.case=T)
  x <- gsub("distanc","distance",x,ignore.case=T)
  x <- gsub("save live","save lives",x,ignore.case=T)
  x <- gsub("presid george","president bush",x,ignore.case=T)
  x <- gsub("women serv","women in service",x,ignore.case=T)
  x <- gsub("black live","black lives matter",x,ignore.case=T)
  return(x)
}

get_hashtags <- function(txt) {
  lst <- str_extract_all(txt, "\\#[^ \n@\\:]+ ")
  return(lst)
}

clean_social_media_txt <- function(txt) {
  txt <- gsub("(@\\w+ @\\w+ \\.|@\\w+ |@\\w+\\:|\\#\\w+ |\\#\\w+$| \\w+\\.com|^RT|https|http|\\.com|www|link)", "", txt, ignore.case=T)
  txt <- gsub("#[A-Za-z0-9]+|@[A-Za-z0-9]+|\\w+(?:\\.\\w+)*/\\S+", "", txt, ignore.case=T)
  
  extra_stopwords <- c(
    "just","link","bio","http","https","we're","watch","download","save","facebook","youtube",
    "press","click","make","makes","video","good","luck","bad","chance","featured","share",
    "stay","instagram","spotify","show","listen","work","said","might","women","black","asian",
    "people of color","summer","conference","event","photo","photos","ready","best","next",
    "first","weekend","like","love","big","small","film","y'all","favorite","post","help",
    "original","visit","enjoy","thanks","thank","one","two","three","four","five","learn",
    "check","year","happy","life","take","season","today","tonight","love","y'all","places",
    "safer","kids","children","million","millions"
  )
  # extra_stopwords <- c(extra_stopwords, unlist(str_split(unlist(TEXT_RGXS), pattern="(\\| |\\(|\\)|\\||\\.\\*)")))
  extra_stopwords <- trimws(extra_stopwords)
  extra_stopwords <- extra_stopwords[extra_stopwords != ""]
  
  stopwords_rgx <- paste0("(",paste0(" ",stopwords("en")," ", collapse="|"),")")
  txt <- gsub(stopwords_rgx, " ", txt, ignore.case=T)
  
  return(txt)
}

append_rgx <- function(df, rgx, rgx_name, text_col="text") {
  rgx_match <- grepl(rgx, df[[text_col]], ignore.case=T)
  df[[paste0(text_col,".",rgx_name)]] <- rgx_match
  return(df)
}

explore_rgx <- function(df, text_col="text", aux_col=NA, rgx, random=T, head=10) {
  if (random) { df <- df[sample(1:nrow(df)),] }   
  rgx_match <- grepl(rgx, df[[text_col]], ignore.case=T)
  if (is.na(aux_col)) {
    d <- df[rgx_match, text_col]
  } else {
    d <- df[rgx_match, c(aux_col, text_col)]
  }
  head(d, head)
}

append_support_phrases <- function(df, text_col="text") {
  ## statement [y/n]
  df[[paste0(text_col,".support")]] <- grepl(SUPPORT_RGX, 
                                             df[[text_col]], ignore.case=T) & 
    !(grepl("(support were|don't stand|won't stand|won't support|don't support)", 
            df[[text_col]], ignore.case=T))
  ## phrase
  df[[paste0(text_col,".support_phrase")]] <- NA
  df[[paste0(text_col,".support_phrase")]][df[[paste0(text_col,".support")]]] <- str_extract(
    tolower(df[df[[paste0(text_col,".support")]], text_col]), 
    paste0(SUPPORT_RGX, "( |\\w|\\-|\\@|\\d)*(\\.|\\;|\\n)?")
  )
  
  return(df)
}

append_oppose_phrases <- function(df, text_col="text") {
  ## statement [y/n]
  df[[paste0(text_col,".oppose")]] <- grepl(OPPOSE_RGX, 
                                            df[[text_col]], ignore.case=T)
  
  ## phrase
  df[[paste0(text_col,".oppose_phrase")]] <- NA
  df[[paste0(text_col,".oppose_phrase")]][df[[paste0(text_col,".oppose")]]] <- str_extract(
    tolower(df[df[[paste0(text_col,".oppose")]], text_col]), 
    paste0(OPPOSE_RGX, "( |\\w|\\-|\\@|\\d)*(\\.|\\;|\\n|not)?")
  )
  df[[paste0(text_col,".oppose_phrase")]][df[[paste0(text_col,".oppose")]]] <- gsub(
    paste0(SUPPORT_RGX, "( |\\w|\\-|\\@|\\d)*(\\.|\\;|\\n|not)?"), "",
    df[[paste0(text_col,".oppose_phrase")]][df[[paste0(text_col,".oppose")]]]
  )
  
  return(df)
}


make_quanteda_objs <- function(
    df,
    text_col="text", 
    id_col="id", 
    groups_col=NA,
    tolower=T, 
    stem=T, 
    min_nchar=NA, 
    remove_stopwords=T, 
    extra_stopwords=c(),
    ngrams=2:3,
    min_termq=NA, 
    max_termq=NA,
    return_toks=FALSE,
    return_corpus=FALSE
) {
  
  pbar <- utils::txtProgressBar(min=0, max=100, style=3)
  
  if (length(extra_stopwords) != 0) {
    df[[text_col]] <- gsub(paste0("(",paste0(extra_stopwords,collapse="|"),")")," ",df[[text_col]])
  }
  
  corpus <- corpus(df, text_field = text_col, docid_field = id_col)
  utils::setTxtProgressBar(pbar, 25)
  
  toks <- tokens(corpus, 
                 remove_punct = TRUE, 
                 remove_symbols = TRUE, 
                 remove_numbers = TRUE,
                 remove_url = TRUE) %T>% 
    { .; utils::setTxtProgressBar(pbar, 30) } %>% {
      if (tolower) tokens_tolower(.)
      else .
    } %T>% { .; utils::setTxtProgressBar(pbar, 35) } %>% {
      if (stem) tokens_wordstem(.)
      else .
    } %T>% { .; utils::setTxtProgressBar(pbar, 40) } %>% {
      if (!is.na(min_nchar)) tokens_select(., min_nchar=min_nchar)
      else .
    } %T>% { .; utils::setTxtProgressBar(pbar, 45) } %>% {
      if (remove_stopwords) tokens_remove(., pattern=c(stopwords("en")))
      else .
    } %T>% 
    { .; utils::setTxtProgressBar(pbar, 50) } %>%
    tokens_ngrams(n=ngrams)
  
  dfm <- dfm(toks)
  utils::setTxtProgressBar(pbar, 55)
  
  utils::setTxtProgressBar(pbar, 60)
  if (!is.na(groups_col)) {
    dfm <- dfm_group(dfm, groups=df[[groups_col]])%T>%
      { .; utils::setTxtProgressBar(pbar, 70) } %>%
      dfm_trim(min_docfreq = 2,
               docfreq_type = "count")
  } else {
    dfm <- dfm %T>%
      { .; utils::setTxtProgressBar(pbar, 70) } %>%
      dfm_trim(min_docfreq = 2,
               docfreq_type = "count")            
  }
  if (!is.na(max_termq) & !is.na(min_termq)) {
    dfm <- dfm %T>% 
      { .; utils::setTxtProgressBar(pbar, 80) } %>%
      dfm_trim(min_termfreq = min_termq, 
               max_termfreq = max_termq,
               termfreq_type = "quantile")
  }
  utils::setTxtProgressBar(pbar, 100)
  
  out <- list()
  out$dfm <-dfm
  if (return_toks) out$toks <- toks
  if (return_corpus) out$corpus <- corpus
  
  return(out)
  
}

## ++ Stats ------------------------------------------------


analyze_missingness <- function(data,
                                lm.specifs,
                                p.adjust = "BHq",
                                p.adjust.alpha = GLOBAL_ALPHA_THRESHOLD,
                                p.adjust.group.vars = c("y"),
                                debug = T,
                                parallelize = T,
                                parallel.cores = T) {
  tidy <- broom::tidy
  coeftest <- lmtest::coeftest
  
    pb_lapply <- function(X, fn) {
      out <- pblapply(X = X, fn)
      return(out)
    }
  
  
  out.list <- pb_lapply(1:nrow(lm.specifs), function(s){
    if (debug) message(s)
    
    specif <- lm.specifs[s,]
    
    if (debug) print(specif)
    
    data.s <- data
    
    data.s$y <- scale(as.numeric(data.s[[specif$y]]))
    data.s$x <- scale(as.numeric(data.s[[specif$x]]))
    
    data.s$y.missing <- is.na(data.s[[specif$y]])
    data.s$x.missing <- is.na(data.s[[specif$x]])
    
    if (all(is.na(data.s$x))) stop(sprintf("x value `%s` not found in data", specif$x))
    if (all(is.na(data.s$y))) stop(sprintf("y value `%s` not found in data", specif$y))
    
    if (length(unique(data.s$x)) < 2) {
      if (debug) print("not unique `x` values")
      return(specif)
    }
    if (length(unique(data.s$y)) < 2) {
      if (debug) print("not enough unique `y` values")
      return(specif)
    }
    if (nrow(data.s) < 30) {
      if (debug) print("not enough observations")
      return(specif)
    }
    
    ## ++ distribution of y for x-missing vs. not
    fit <- lm_robust(y ~ x.missing, data = data.s)
    tidy.fit <- tidy(fit)
    est <- tidy.fit[[2,2]]
    se <- tidy.fit[[2,3]]
    t.stat <- tidy.fit[[2,4]]
    p.val <- tidy.fit[[2,5]]
    
    specif$y.x.missing.est <- est
    specif$y.x.missing.tstat <- t.stat
    specif$y.x.missing.pval <- p.val
    specif$y.x.missing.std.err <- se
    
    ## ++ distribution of x for y-missing vs. not
    fit <- lm_robust(x ~ y.missing, data = data.s)
    tidy.fit <- tidy(fit)
    est <- tidy.fit[[2,2]]
    se <- tidy.fit[[2,3]]
    t.stat <- tidy.fit[[2,4]]
    p.val <- tidy.fit[[2,5]]
   
    specif$x.y.missing.est <- est
    specif$x.y.missing.tstat <- t.stat
    specif$x.y.missing.pval <- p.val    
    specif$x.y.missing.std.err <- se
    
    return(specif)
  })
  out.df <- bind_rows(out.list)
  
  
  ## ++ adjust p-values
  if (any(!is.na(out.df$y.x.missing.pval))) {
    message("Adjusting p-values")
    
    out.df <- out.df %>%
      group_by_at(p.adjust.group.vars) %>%
      arrange(x.y.missing.pval) %>%
      mutate(k = n(), ## number of hypotheses
             r = 1:n(), ## rank of p-values
             x.y.missing.p.value.adj.alpha = (r*p.adjust.alpha)/k, ## stepped-up thresholds
             x.y.missing.p.value.adj.sig = x.y.missing.pval < x.y.missing.p.value.adj.alpha, ## stepped-up hypothesis tests
             x.y.missing.p.value.adj.zcrit = qnorm(1 - (x.y.missing.p.value.adj.alpha)/2), ## new critical values for asymptotic CIs
             x.y.missing.p.value.adj.tcrit = qt(1 - (x.y.missing.p.value.adj.alpha)/2, nrow(data)-1) ## new critical values for small-sample CIs
      )
  }
  
  if (any(!is.na(out.df$x.y.missing.pval))) {
    message("Adjusting p-values")
    
    out.df <- out.df %>%
      group_by_at(p.adjust.group.vars) %>%
      arrange(y.x.missing.pval) %>%
      mutate(k = n(), ## number of hypotheses
             r = 1:n(), ## rank of p-values
             y.x.missing.p.value.adj.alpha = (r*p.adjust.alpha)/k, ## stepped-up thresholds
             y.x.missing.p.value.adj.sig = y.x.missing.pval < y.x.missing.p.value.adj.alpha, ## stepped-up hypothesis tests
             y.x.missing.p.value.adj.zcrit = qnorm(1 - (y.x.missing.p.value.adj.alpha)/2), ## new critical values for asymptotic CIs
             y.x.missing.p.value.adj.tcrit = qt(1 - (y.x.missing.p.value.adj.alpha)/2, nrow(data)-1) ## new critical values for small-sample CIs
      )
  }  
  
  return(out.df)  
  
}

measure_bootstrapped_alignments_robustly <- function(data.boots.x,
                                                     lm.specifs,
                                                     lm.func,
                                                     p.adjust = "BHq",
                                                     p.adjust.group.vars = c("y"),
                                                     p.adjust.alpha = GLOBAL_ALPHA_THRESHOLD,
                                                     quad.stats = TRUE,
                                                     parallelize = TRUE,
                                                     parallel.cores = 8,
                                                     ...) {
  ##
  ## `data.boots.x` must be indexed by `boot`.
  ##
  select <- dplyr::select
  n <- dplyr::n

  boots <- na.omit(unique(data.boots.x$boot))
  
  if (parallelize) {
    pb_lapply <- function(fn) {
      message("making bootstrapping cluster")
      nc <- parallel.cores
      cl <- parallel::makeCluster(nc, outfile = "")
      doParallel::registerDoParallel(cl)
      on.exit(parallel::stopCluster(cl), add = TRUE)
      parallel::clusterExport(cl = cl, 
                              envir = environment(),
                              varlist = c("data.boots.x","tibble","tidy","nobs","coeftest","vcovCL","parallel.cores","parallelize",
                                          "mutate","do","filter","rename_all","sample_frac","pblapply","boots",
                                          "chisq.test","binom.test","zaminfl_summary","bind_rows",
                                          "bind_cols","%>%","lm.specifs","lm.func","p.adjust",
                                          "p.adjust.group.vars","p.adjust.alpha","p.adjust.alpha",
                                          "ComputeModelInfluence","AppendTargetRegressorInfluence",
                                          "GetInferenceSignals","PredictForSignals","RerunForSignals",
                                          "GetSignalsAndRerunsDataframe","pivot_wider",
                                          "arrange_at","relocate","select","tost",
                                          "group_by_at","arrange","starts_with","n",
                                          # "torch_tensor","torch_double","torch_matmul","torch_sum","torch_inverse","torch_sqrt",
                                          "quad.stats","debug","measure_alignments_robustly"))
      out <- pblapply(cl = cl, X = boots, fn)
      return(out)
    }
  } else {
    pb_lapply <- function(fn) {
      message("performing bootstrap in sequence")
      out <- pblapply(X = boots, fn)
      return(out)
    }
  }
  
  out.list <- pb_lapply(function(b){
    b.out.df <- measure_alignments_robustly(data = subset(data.boots.x, boot == b),
                                            lm.specifs = lm.specifs,
                                            lm.func = lm.func,
                                            p.adjust = p.adjust,
                                            p.adjust.group.vars = p.adjust.group.vars,
                                            p.adjust.alpha = p.adjust.alpha,
                                            quad.stats = quad.stats,
                                            parallelize = ifelse(parallelize,F,T),
                                            parallel.cores = ifelse(parallelize,F,16),
                                            zaminfl = FALSE,
                                            ...)
    b.out.df$boot <- b
    return(b.out.df)
  })
  out.df <- bind_rows(out.list)
  return(out.df)
}

measure_alignments_robustly <- function(data,
                                        lm.specifs,
                                        lm.func,
                                        p.adjust = "BHq",
                                        p.adjust.group.vars = c("y","subset.var.val"),
                                        p.adjust.alpha = GLOBAL_ALPHA_THRESHOLD,
                                        quad.stats = TRUE,
                                        debug = FALSE,
                                        parallelize = TRUE,
                                        parallel.cores = 8,
                                        equiv.test = FALSE,
                                        zaminfl = TRUE,
                                        # sanitize.x = TRUE,
                                        ...) {
  t.0 <- Sys.time()
  tidy <- broom::tidy
  coeftest <- lmtest::coeftest
  if (parallelize) {
    pb_lapply <- function(X, fn) {
      nc <- parallel.cores
      cl <- parallel::makeCluster(nc, outfile = "")
      doParallel::registerDoParallel(cl)
      on.exit(parallel::stopCluster(cl), add = TRUE)
      zaminfl_envir <- environment(zaminfl_summary)
      message("making cluster")
      parallel::clusterExport(cl = cl, 
                              envir = environment(),
                              varlist = c("data","tibble","tidy","nobs","coeftest","vcovCL","parallelize",
                                          "mutate","do","filter","rename_all","sample_frac",
                                          "chisq.test","binom.test","zaminfl_summary","bind_rows",
                                          "bind_cols","%>%","lm.specifs","lm.func","p.adjust",
                                          "p.adjust.group.vars","p.adjust.alpha","p.adjust.alpha",
                                          "ComputeModelInfluence","AppendTargetRegressorInfluence",
                                          "GetInferenceSignals","PredictForSignals","RerunForSignals",
                                          "GetSignalsAndRerunsDataframe","pivot_wider","tost",
                                          # "torch_tensor","torch_double","torch_matmul","torch_sum","torch_inverse","torch_sqrt",
                                          "quad.stats","debug"))      
      message("running job")
      out <- pblapply(cl = cl, X = X, fn)
      # parallel::stopCluster(cl)
      return(out)
    }
  } else {
    pb_lapply <- function(X, fn) {
      out <- pblapply(X = X, fn)
      return(out)
    }
  }
  # if (sanitize.x) {
  #   lm.specifs$x.name <- sanitize_var(lm.specifs$x)
  # }
  
  out.list <- pb_lapply(1:nrow(lm.specifs), function(s){
    if (debug) message(s)
    
    specif <- lm.specifs[s,]
    
    if (debug) print(specif)
    
    data.s <- data
    if (specif$bin.x.missing) data.s[[specif$x]] <- as.numeric(!is.na(data.s[[specif$x]]))
    if (specif$bin.y.missing) data.s[[specif$y]] <- as.numeric(!is.na(data.s[[specif$y]]))
    
    if (specif$sign.x & is.na(specif$x.mid)) data.s[[specif$x]] <- sign(as.numeric(data.s[[specif$x]]))
    if (specif$sign.x & !is.na(specif$x.mid)) data.s[[specif$x]] <- as.numeric(data.s[[specif$x]]) > specif$x.mid
    if (specif$sign.y & is.na(specif$y.mid)) data.s[[specif$y]] <- sign(as.numeric(data.s[[specif$y]]))
    if (specif$sign.y & !is.na(specif$y.mid)) data.s[[specif$y]] <- as.numeric(data.s[[specif$y]]) > specif$y.mid
    
    data.s[[specif$x]] <- as.numeric(data.s[[specif$x]])
    data.s[[specif$y]] <- as.numeric(data.s[[specif$y]])
    data.s$y <- data.s[[specif$y]]
    data.s$x <- data.s[[specif$x]]

    if (all(is.na(data.s$x))) stop(sprintf("x value `%s` not found in data", specif$x))
    if (all(is.na(data.s$y))) stop(sprintf("y value `%s` not found in data", specif$y))
    
    if (!is.na(specif$w) & specif$w != "") {
      data.s$w <- data.s[[specif$w]]
    } else {
      data.s$w <- 1
    }
    if (!is.na(specif$se.clus) & specif$se.clus != "") {
      data.s$se.clus <- data.s[[specif$se.clus]]
    } else {
      data.s$se.clus <- NA
    }
    
    data.s <- data.s[!is.na(data.s$y),]
    data.s <- data.s[!is.na(data.s$x),]
    if (!is.na(specif$se.clus) & specif$se.clus != "") {
      data.s <- data.s[!is.na(data.s$se.clus),]
    }
    if (!is.na(specif$w) & specif$w != "") {
      data.s <- data.s[!is.na(data.s$w),]
    }
    if (!is.na(specif$subset.var) & specif$subset.var != "" &
        !is.na(specif$subset.var.val) & specif$subset.var.val != "") {
      data.s <- data.s[data.s[[specif$subset.var]] %in% specif$subset.var.val,]
    }        
    
    if (length(unique(data.s$x)) < 2) {
      if (debug) print("not unique `x` values")
      return(specif)
    }
    if (length(unique(data.s$y)) < 2) {
      if (debug) print("not enough unique `y` values")
      return(specif)
    }
    if (nrow(data.s) < 30) {
      if (debug) print("not enough observations")
      return(specif)
    }

    ## ++ estimate OLS alignment given specification
    if (debug) print("running `lm.func`")
    # if (debug) {
    #   fit <- lm.func(formula = as.formula(specif$formula),
    #                  x=TRUE, y=TRUE,
    #                  data = data.s,
    #                  weights = data.s$w)
    # } else {
      fit <- lm.func(formula = as.formula(specif$formula),
                     x=TRUE, y=TRUE,
                     data = data.s,
                     weights = data.s$w,
                     ...)
    # }
    
    if (debug) print(fit)
    
    ## ++ estimate correlations in different ways
    if (debug) print("running `cor`")
    rho          <- cor(data.s$x, data.s$y, use="pairwise.complete.obs")
    rho.kendall  <- cor(data.s$x, data.s$y, use="pairwise.complete.obs", method="kendall")
    rho.spearman <- cor(data.s$x, data.s$y, use="pairwise.complete.obs", method="spearman")    
    
    ## ++ adjust standard errors
    if (debug) print("running `coeftest`")
    if (is.na(specif$se.clus) | specif$se.clus == "") {
      fit.se <- coeftest(fit, type = "HC0", vcov. = vcovCL)
    } else {
      fit.se <- coeftest(fit, type = "HC0", vcov. = vcovCL, cluster = as.formula(paste0("~",specif$se.clus)))
    }    
    fit.df <- tibble() %>%
      do(fit.obj.se = fit.se) %>%
      bind_cols(fit.se %>%
                  broom::tidy()) %>% 
      filter(term != "(Intercept)") %>%
      mutate(n.obs = nobs(fit))
    fit.df$fit.obj <- list(fit)
    fit.df$fit.obj.class <- paste(class(fit), collapse = ".")
    
    if (!is.na(specif$x.mid) & !is.na(specif$y.mid) & quad.stats) {
      ## ++ estimate quadrant statistics
      if (debug) print("estimating quadrant stats")
      data.s$q4 <- data.s$x > specif$x.mid & data.s$y < specif$y.mid
      data.s$q3 <- data.s$x < specif$x.mid & data.s$y < specif$y.mid
      data.s$q2 <- data.s$x < specif$x.mid & data.s$y > specif$y.mid
      data.s$q1 <- data.s$x > specif$x.mid & data.s$y > specif$y.mid
      
      q4.n <- sum(data.s$q4,na.rm=T)
      q3.n <- sum(data.s$q3,na.rm=T)
      q2.n <- sum(data.s$q2,na.rm=T)
      q1.n <- sum(data.s$q1,na.rm=T)
      
      qcr <- ((q1.n+q3.n) - (q2.n+q4.n))/(q1.n+q2.n+q3.n+q4.n)
      fit.df$quad.count.ratio <- qcr
      
      fit.df$quad.rt.n <- q1.n
      fit.df$quad.lt.n <- q2.n
      fit.df$quad.lb.n <- q3.n
      fit.df$quad.rb.n <- q4.n
      
      fit.df$quad.rt.pct <- q1.n/(q1.n+q2.n+q3.n+q4.n)
      fit.df$quad.lt.pct <- q2.n/(q1.n+q2.n+q3.n+q4.n)
      fit.df$quad.lb.pct <- q3.n/(q1.n+q2.n+q3.n+q4.n)
      fit.df$quad.rb.pct <- q4.n/(q1.n+q2.n+q3.n+q4.n)
      
      ## ++ estimate quadrant alignment % bootstrapped
      q.align.pct.boots <- sapply(1:100, function(boot){
        data.s[sample(1:nrow(data.s), replace=T),] %>%
          with(., mean((x > specif$x.mid & y > specif$y.mid) | 
                       (x < specif$x.mid & y < specif$y.mid)))
      })
      q.align.pct.mean      <- mean(q.align.pct.boots)
      q.align.pct.ci.95.lwr <- quantile(q.align.pct.boots, probs=0.025)
      q.align.pct.ci.95.upr <- quantile(q.align.pct.boots, probs=0.975)
      fit.df$quad.align.pct.mean      <- q.align.pct.mean
      fit.df$quad.align.pct.ci.95.lwr <- q.align.pct.ci.95.lwr
      fit.df$quad.align.pct.ci.95.upr <- q.align.pct.ci.95.upr
      
      ## ++ estimate quadrant p-values
      if (debug) print("running `chisq.test` and `binom.test`")
      q.chisq.test <- chisq.test(table(data.s$x > specif$x.mid, data.s$y < specif$y.mid))
      
      q.binom.test <- binom.test(x = q1.n+q3.n,
                                 n = q1.n+q2.n+q3.n+q4.n,
                                 alternative = "greater")
      fit.df$quad.chisq.test <- list(q.chisq.test)
      fit.df$quad.chisq.pval <- q.chisq.test$p.value
      
      fit.df$quad.binom1s.test <- list(q.binom.test)
      fit.df$quad.binom1s.pval <- q.binom.test$p.value
      
    }
    
    ## ++ equivalence tests
    if (equiv.test) {
      
      
      tost.0.36.u <- tost(x = scale(data.s$x,center=T), y = scale(data.s$y,center=T), epsilon = 0.36) ## Hartman and Hidalgo
      tost.0.25.u <- tost(x = scale(data.s$x,center=T), y = scale(data.s$y,center=T), epsilon = 0.25) ## Ho et al./Imbens and Rubin
      tost.0.20.u <- tost(x = scale(data.s$x,center=T), y = scale(data.s$y,center=T), epsilon = 0.20) ## Cohen
      tost.0.36.l <- tost(x = scale(data.s$x,center=T), y = scale(data.s$y,center=T), epsilon = -0.36) ## Hartman and Hidalgo
      tost.0.25.l <- tost(x = scale(data.s$x,center=T), y = scale(data.s$y,center=T), epsilon = -0.25) ## Ho et al./Imbens and Rubin
      tost.0.20.l <- tost(x = scale(data.s$x,center=T), y = scale(data.s$y,center=T), epsilon = -0.20) ## Cohen
      
      fit.df$tost.0.36.u <- list(tost.0.36.u)
      fit.df$tost.0.25.u <- list(tost.0.25.u)
      fit.df$tost.0.20.u <- list(tost.0.20.u)
      fit.df$tost.0.36.l <- list(tost.0.36.l)
      fit.df$tost.0.25.l <- list(tost.0.25.l)
      fit.df$tost.0.20.l <- list(tost.0.20.l)
      
      fit.df$tost.0.36.u.pval <- tost.0.36.u$tost.p.value
      fit.df$tost.0.25.u.pval <- tost.0.25.u$tost.p.value
      fit.df$tost.0.20.u.pval <- tost.0.20.u$tost.p.value
      fit.df$tost.0.36.l.pval <- tost.0.36.l$tost.p.value
      fit.df$tost.0.25.l.pval <- tost.0.25.l$tost.p.value
      fit.df$tost.0.20.l.pval <- tost.0.20.l$tost.p.value
      
      fit.df$tost.0.36.u.ci.95.upr <- tost.0.36.u$tost.interval[[1]]
      fit.df$tost.0.36.u.ci.95.lwr <- tost.0.36.u$tost.interval[[2]]
      fit.df$tost.0.25.u.ci.95.upr <- tost.0.25.u$tost.interval[[1]]
      fit.df$tost.0.25.u.ci.95.lwr <- tost.0.25.u$tost.interval[[2]]
      fit.df$tost.0.20.u.ci.95.upr <- tost.0.20.u$tost.interval[[1]]
      fit.df$tost.0.20.u.ci.95.lwr <- tost.0.20.u$tost.interval[[2]]
      
      fit.df$tost.0.36.l.ci.95.upr <- tost.0.36.l$tost.interval[[1]]
      fit.df$tost.0.36.l.ci.95.lwr <- tost.0.36.l$tost.interval[[2]]
      fit.df$tost.0.25.l.ci.95.upr <- tost.0.25.l$tost.interval[[1]]
      fit.df$tost.0.25.l.ci.95.lwr <- tost.0.25.l$tost.interval[[2]]
      fit.df$tost.0.20.l.ci.95.upr <- tost.0.20.l$tost.interval[[1]]
      fit.df$tost.0.20.l.ci.95.lwr <- tost.0.20.l$tost.interval[[2]]
      
      fit.df$tost.ref.est <- unname(lm(scale(data.s$y,center=T) ~ scale(data.s$x,center=T))$coefficients[2])
    
    }    
    
    ## ++ estimate influence statistics
    if (debug) print("running `zaminfl` funcs")
    if (!parallelize & zaminfl) {
      fit.zaminfl.summary <- zaminfl_summary(fit, 
                                             x.var = names(fit$coefficients)[2])
      
      fit.df <- bind_cols(fit.df,
                          rename_all(fit.zaminfl.summary$short, ~paste0("zaminfl.",.x)))
      
      fit.df$zaminfl.summary.long <- list(fit.zaminfl.summary$long)
    }
    
    fit.df <- bind_cols(fit.df, specif)
    return(fit.df)
  })
  out.df <- bind_rows(out.list)
  
  if (parallelize & zaminfl) {
    message("Running influence statistics")
    zaminfl.out <- pblapply(1:nrow(out.df), function(i) {
      if (debug) print(i)
      fit.zaminfl.summary <- zaminfl_summary(out.df[i,]$fit.obj[[1]], 
                                             x.var = names(out.df[i,]$fit.obj[[1]]$coefficients)[2])
      
      return(rename_all(fit.zaminfl.summary$short, ~paste0("zaminfl.",.x)))
    })
    zaminfl.out.df <- bind_rows(zaminfl.out)
    out.df <- bind_cols(out.df, zaminfl.out.df)
  }
  
  ## ++ adjust p-values
  if ((p.adjust == "BHq" | p.adjust == TRUE) & any(!is.na(out.df$p.value))) {
    message("Adjusting p-values")
    
    out.df <- out.df %>%
      group_by_at(p.adjust.group.vars) %>%
      arrange(p.value) %>%
      mutate(p.value.adj.method = "BHq",
             k = n(), ## number of hypotheses
             r = 1:n(), ## rank of p-values
             p.value.adj.alpha = (r*p.adjust.alpha)/k, ## stepped-up thresholds
             p.value.adj.sig = p.value < p.value.adj.alpha, ## stepped-up hypothesis tests
             p.value.adj.zcrit = qnorm(1 - (p.value.adj.alpha)/2), ## new critical values for asymptotic CIs
             p.value.adj.tcrit = qt(1 - (p.value.adj.alpha)/2, n.obs-1) ## new critical values for small-sample CIs
      ) %>%
      select(-k, -r) %>%
      relocate((starts_with("p.value.adj")), .after = p.value) %>%
      arrange_at(p.adjust.group.vars)
  }
  
  ## ++ adjust p-values for equivalence tests (Hartman & Hidalgo)
  if ((p.adjust == "BHq" | p.adjust == TRUE) & equiv.test & any(!is.na(out.df$tost.0.20.u.pval))) {
    message("Adjusting p-values (TOSTs, ε=0.20)")
    
    out.df <- out.df %>%
      rowwise() %>%
      mutate(tost.0.20.pval = min(tost.0.20.l.pval, tost.0.20.u.pval, na.rm=T)) %>%  
      group_by_at(p.adjust.group.vars) %>%
      arrange(tost.0.20.pval) %>%
      mutate(p.value.adj.method = "BHq",
             k = n(), ## number of hypotheses
             r = 1:n(), ## rank of p-values
             tost.0.20.u.p.value.adj.alpha = (r*GLOBAL_ALPHA_THRESHOLD)/k, ## stepped-up thresholds
             tost.0.20.u.p.value.adj.sig = tost.0.20.pval < tost.0.20.u.p.value.adj.alpha, ## stepped-up hypothesis tests
             tost.0.20.u.p.value.adj.zcrit = qnorm(1 - (tost.0.20.u.p.value.adj.alpha)/2), ## new critical values for asymptotic CIs
             tost.0.20.u.p.value.adj.tcrit = qt(1 - (tost.0.20.u.p.value.adj.alpha)/2, n.obs-1) ## new critical values for small-sample CIs
      ) %>%
      select(-k, -r) %>%
      ungroup()
  }
  if ((p.adjust == "BHq" | p.adjust == TRUE) & equiv.test & any(!is.na(out.df$tost.0.25.u.pval))) {
    message("Adjusting p-values (TOSTs, ε=0.25)")
    
    out.df <- out.df %>%
      rowwise() %>%
      mutate(tost.0.25.pval = min(tost.0.25.l.pval, tost.0.25.u.pval, na.rm=T)) %>%  
      group_by_at(p.adjust.group.vars) %>%
      arrange(tost.0.25.pval) %>%
      mutate(p.value.adj.method = "BHq",
             k = n(), ## number of hypotheses
             r = 1:n(), ## rank of p-values
             tost.0.25.u.p.value.adj.alpha = (r*GLOBAL_ALPHA_THRESHOLD)/k, ## stepped-up thresholds
             tost.0.25.u.p.value.adj.sig = tost.0.25.pval < tost.0.25.u.p.value.adj.alpha, ## stepped-up hypothesis tests
             tost.0.25.u.p.value.adj.zcrit = qnorm(1 - (tost.0.25.u.p.value.adj.alpha)/2), ## new critical values for asymptotic CIs
             tost.0.25.u.p.value.adj.tcrit = qt(1 - (tost.0.25.u.p.value.adj.alpha)/2, n.obs-1) ## new critical values for small-sample CIs
      ) %>%
      select(-k, -r) %>%
      ungroup()
  }
  if ((p.adjust == "BHq" | p.adjust == TRUE) & equiv.test & any(!is.na(out.df$tost.0.36.u.pval))) {
    message("Adjusting p-values (TOSTs, ε=0.36)")
    
    out.df <- out.df %>%
      rowwise() %>%
      mutate(tost.0.36.pval = min(tost.0.36.l.pval, tost.0.36.u.pval, na.rm=T)) %>%  
      group_by_at(p.adjust.group.vars) %>%
      arrange(tost.0.36.pval) %>%
      mutate(p.value.adj.method = "BHq",
             k = n(), ## number of hypotheses
             r = 1:n(), ## rank of p-values
             tost.0.36.u.p.value.adj.alpha = (r*GLOBAL_ALPHA_THRESHOLD)/k, ## stepped-up thresholds
             tost.0.36.u.p.value.adj.sig = tost.0.36.pval < tost.0.36.u.p.value.adj.alpha, ## stepped-up hypothesis tests
             tost.0.36.u.p.value.adj.zcrit = qnorm(1 - (tost.0.36.u.p.value.adj.alpha)/2), ## new critical values for asymptotic CIs
             tost.0.36.u.p.value.adj.tcrit = qt(1 - (tost.0.36.u.p.value.adj.alpha)/2, n.obs-1) ## new critical values for small-sample CIs
      ) %>%
      select(-k, -r) %>%
      ungroup()
  }  
  print(Sys.time()-t.0)
  return(out.df)
}

zaminfl_summary <- function(fit.obj, x.var) {
  ##
  ## Summarize output of `zaminfluence` package [(Broderick Et Al.)](https://arxiv.org/abs/2011.14999)
  ##
  if (is.null(fit.obj)) {
    return(list(short=data.frame(
      sign.change.prop_drop=NA,
      sign.change.n_drop=NA,
      
      sig.change.prop_drop=NA,
      sig.change.n_drop=NA,
      
      both.change.prop_drop=NA,
      both.change.n_drop=NA
    ),
    long=data.frame()))  
  }
  # if (is.na(fit.obj)) {
  #   return(list(short=data.frame(
  #     sign.change.prop_drop=NA,
  #     sign.change.n_drop=NA,
  #     
  #     sig.change.prop_drop=NA,
  #     sig.change.n_drop=NA,
  #     
  #     both.change.prop_drop=NA,
  #     both.change.n_drop=NA
  #   ),
  #   long=data.frame()))  
  # }
  # Get influence scores for chosen regressors
  model.grads <-
    ComputeModelInfluence(fit.obj) %>%
    AppendTargetRegressorInfluence(x.var)

  # Compute the changes needed to change sign, significance, and both
  signals <- GetInferenceSignals(model.grads) #want signal to be large (i.e. large changes in input produce large differences in output)
  if (!signals[[x.var]]$sig$apip$success) {
    warning("Could not find an APIP to reverse significance")
    return(list(short=data.frame(
      sign.change.prop_drop=NA,
      sign.change.n_drop=NA,
      
      sig.change.prop_drop=NA,
      sig.change.n_drop=NA,
      
      both.change.prop_drop=NA,
      both.change.n_drop=NA
    ),
    long=data.frame()))
  } else {
  
    # Predict the changes, re-run the model at the left-out points, and
    # compare the two.
    preds <- PredictForSignals(signals, model.grads) #have model predict outputs using just influential set
    preds.df <- GetSignalsAndRerunsDataframe(signals, preds, model.grads)
    
    reruns <- RerunForSignals(signals, model.grads) #re-fit model using just the influential set
    reruns.df <- GetSignalsAndRerunsDataframe(signals, reruns, model.grads)

    summary.df <-
      rbind(reruns.df %>% mutate(method="rerun"),
            preds.df %>% mutate(method="prediction")) %>%
      pivot_wider(names_from=method, values_from=value)
    
    short.df <- data.frame(
      sign.change.prop_drop=signals[[x.var]]$sign$apip$prop,
      sign.change.n_drop=signals[[x.var]]$sign$apip$n,
      
      sig.change.prop_drop=signals[[x.var]]$sig$apip$prop,
      sig.change.n_drop=signals[[x.var]]$sig$apip$n,
      
      both.change.prop_drop=signals[[x.var]]$both$apip$prop,
      both.change.n_drop=signals[[x.var]]$both$apip$n
    )
    return(list(short=short.df,
                long=summary.df))
  }
}

build_specifs <- function(data,
                          y.vars, 
                          x.vars,
                          y.mids = c(),
                          x.mids = c(),
                          control.vars = c(),
                          subset.vars = c(),
                          weight.vars = c(),
                          se.cluster.vars = c(),
                          bin.y.missing = FALSE,
                          bin.x.missing = FALSE,
                          scale.y = TRUE,
                          scale.x = TRUE,
                          sign.y = FALSE,
                          sign.x = FALSE) {
  ##
  ## Build a dataframe of specifications to pass into
  ## `measure_*` functions.
  ##
  y.mids <- unlist(y.mids)
  x.mids <- unlist(x.mids)
  
  if (!(all(is.na(y.mids))) | length(y.mids) > 0)
    specif.grid.y <- data.frame(y=y.vars, y.mid=unname(unlist(sapply(y.vars, function(.) ifelse(. %in% names(y.mids), y.mids[.], NA)))))
  else
    specif.grid.y <- data.frame(y=y.vars, y.mid=NA)
  
  if (!(all(is.na(x.mids))) | length(x.mids) > 0)
    specif.grid.x <- data.frame(x=x.vars, x.mid=unname(unlist(sapply(x.vars, function(.) ifelse(. %in% names(x.mids), x.mids[.], NA)))))
  else
    specif.grid.x <- data.frame(x=x.vars, x.mid=NA)
  
  specif.grid <- crossing(specif.grid.x, specif.grid.y)
  
  if (length(control.vars) != 0)
    specif.grid <- crossing(specif.grid, c=control.vars)
  else
    specif.grid <- mutate(specif.grid, c=NA)
  
  if (length(weight.vars) != 0)
    specif.grid <- crossing(specif.grid, w=weight.vars)
  else
    specif.grid <- mutate(specif.grid, w=NA)
  
  if (length(se.cluster.vars) != 0)
    specif.grid <- crossing(specif.grid, se.clus=se.cluster.vars)
  else
    specif.grid <- mutate(specif.grid, se.clus=NA)
  
  specif.grid$bin.y.missing <- bin.y.missing
  specif.grid$bin.x.missing <- bin.x.missing
  
  specif.grid$sign.y <- sign.y
  specif.grid$sign.x <- sign.x
  
  specif.grid.0 <- specif.grid
  if (!all(is.na(subset.vars)) & length(subset.vars) > 0) {
    for (subset.var in subset.vars) {
      specif.grid <- crossing(specif.grid, 
                              distinct_at(data, subset.var) %>%
                                gather(key="subset.var", value="subset.var.val"))
    }
    specif.grid <- filter(specif.grid, !is.na(subset.var.val))
    specif.grid <- bind_rows(specif.grid, 
                             specif.grid.0 %>%
                               mutate(subset.var.val = "all")) ### include specifications with all data    
  } else {
    specif.grid$subset.var <- NA
    specif.grid$subset.var.val <- "all"
  }
  rm(specif.grid.0)
  message(sprintf("%i specifications", nrow(specif.grid)))
  
  specifs <- bind_rows(pblapply(1:nrow(specif.grid), function(s) {
    df <- specif.grid[s,]
    y_ <- paste0("`",df$y,"`")
    x_ <- paste0("`",df$x,"`")
    if (scale.y) y_ <- paste0("scale(",y_,")")
    if (scale.x) x_ <- paste0("scale(",x_,")")
    
    if (length(control.vars) != 0) {
      df$formula <- paste0(y_," ~ ", x_, " + ", df$c)
    } else {
      df$formula <- paste0(y_," ~ ", x_)
    }
    
    return(df)
  }))
  return(specifs)
}

my_boot <- function(x, times=1000) {
  # Get column name from input object
  var = deparse(substitute(x))
  var = gsub("^\\.\\$","", var)
  
  # Bootstrap 95% CI
  # cis = quantile(replicate(times, mean(sample(x, replace=TRUE))), probs=c(0.025,0.975))
  cis = quantile(replicate(times, log(`count`+LOG_OFFSET) %*% chi2), probs=c(0.025, 0.975))
  
  # Return data frame of results
  data.frame(var, n=length(x), mean=mean(x), lower.ci=cis[1], upper.ci=cis[2])
}

orient_idealpts_text <- function(df, ideal_col = "chi2", text_col = "feature", anchor_text_rgx="black.*women", anchor_sign=-1, scale=FALSE) {
  ## Make sure that lower values = liberal, higher values = conservative
  df <- orient_idealpts_brands(df, ideal_col, text_col, anchor_text_rgx, anchor_sign, scale)
  return(df)
}

orient_idealpts_brands <- function(df, ideal_col = "ideal", brand_col = "yougov_name", anchor_brand_rgx="Harley.*Davidson", anchor_sign=1, scale=FALSE) {
  ## Make sure that lower values = liberal, higher values = conservative
  if (scale == TRUE) df <- mutate_at(df, ideal_col, ~scale(.x)[,1])
  d0 <- unique(sign(df[grepl("Harley.*Davidson", df[[brand_col]], ignore.case=T),][[ideal_col]]))
  if (!all(d0 == anchor_sign)) {
    ### flip
    df <- mutate_at(df, ideal_col, ~-.x)
  }
  return(df)
}

## ++ Other ------------------------------------------------

save.obj <- function(obj, file.name){
  ## save to allow for a progress bar
  outfile <- file(file.name, "wb")
  serialize(object, outfile)
  close(outfile)
}

load.obj <- function(file.name){
  ## load with a progress bar
  filesize <- file.info(file.name)$size
  chunksize <- ceiling(filesize / 100)
  pb <- txtProgressBar(min = 0, max = 100, style=3)
  infile <- file(file.name, "rb")
  data <- foreach(it = 1:100, .combine = c) %do% {
    setTxtProgressBar(pb, it)
    readBin(infile, "raw", chunksize)
  }
  close(infile)
  close(pb)
  return(unserialize(data))
}

coalesce_join <- function(x, y, 
                          by = NULL, suffix = c(".x", ".y"), 
                          join = dplyr::full_join, ...) {
  joined <- join(x, y, by = by, suffix = suffix, ...)
  # names of desired output
  cols <- union(names(x), names(y))
  
  to_coalesce <- names(joined)[!names(joined) %in% cols]
  suffix_used <- suffix[ifelse(endsWith(to_coalesce, suffix[1]), 1, 2)]
  # remove suffixes and deduplicate
  to_coalesce <- unique(substr(
    to_coalesce, 
    1, 
    nchar(to_coalesce) - nchar(suffix_used)
  ))
  
  coalesced <- purrr::map_dfc(to_coalesce, ~dplyr::coalesce(
    joined[[paste0(.x, suffix[1])]], 
    joined[[paste0(.x, suffix[2])]]
  ))
  names(coalesced) <- to_coalesce
  
  dplyr::bind_cols(joined, coalesced)[cols]
}

message("✓")